This project aims to establish a proxy for determining the activity of transcription factors based on the expression levels of genes regulated by those factors.
The null hypothesis posits that the abundance of transcripts serves as a reliable proxy for activity. However, this assumption may not hold for all transcription factors. Some factors are active only when phosphorylated, meaning that their transcripts may be constitutively expressed while activity depends on phosphorylation, correlating instead with the activity of the corresponding kinase.
We will also focus on develop a general model to relate transcription factor activity to transcript abundance. Significant deviations from this model could indicate transcription factors whose activity is not accurately reflected by transcript levels.
Here are the steps that we performed in our analysis
Exploratory analysis
Models …
library(readr)
library(matrixStats)
library(dplyr)
library(ggplot2)
library(grid)
library(gridExtra)
library(caret)
library(glmnet)
library(igraph)
library(sigmoid)
library(vip)
library(ggrepel)
set.seed(123)
Let’s start importing the dataset from PreciseDB that contains theexpression levels for 3,923 genes in E. coli for 278 different condition
# Import file
log_tpm <- read.csv("log_tpm_full.csv", row.names = 1)
log_tpm
Next we want to perform some preprocessing. First we want to exclude:
Condition with knockout genes
Genes with isoforms
+++ WHY +++
# Exclude condition with knockout genes
log_tpm <- log_tpm[, -grep("_del", colnames(log_tpm))]
# Drop genes with isoform
genes_with_isoforms <- grep("_\\d+$", rownames(log_tpm), value = TRUE)
log_tpm <- subset(log_tpm, !(rownames(log_tpm) %in% genes_with_isoforms))
dim(log_tpm)
[1] 4257 188
Now we can proceed to import the regulatory network from RegulonDB that reports the target genes for each regulator.
regulator <- read.table(file="tableDataReg.csv",
header=TRUE,
sep=",")
regulator
We decide to eliminate:
Lines that are not referred to a protein regulator
Relationships reported as Weak or Unknown
regulator <- regulator[which(regulator[, 2] != "ppGpp"), ]
w <- which(trimws(regulator[,7])=="W")
if(length(w)>0){
regulator <- regulator[-w,]
}
w <- which(trimws(regulator[,7])=="?")
if(length(w)>0){
regulator <- regulator[-w,]
}
nrow(regulator)
[1] 4229
There is a discrepancy between the gene identifiers present in the regulatory network obtained from RegulonDB and those in the PreciseDB dataset. While the regulatory network uses gene names, our dataset contained identifiers in the form of Bnums. So it’s better to convert bnums to gene names to facilitate comparison and downstream analysis. In addition we decided to:
Remove unmapped genes
Remove duplicate genes
# Loading files from ECOcyc
map_bnum <- read.delim("mapbnum.txt", header = TRUE)
map_bnum <- map_bnum[c("Gene.Name", "Accession.1")]
# Map between my dataset and the file of ecocyc
log_tpm$gene_number <- rownames(log_tpm)
log_tpm <- merge(log_tpm, map_bnum, by.x = "gene_number", by.y = "Accession.1", all.x = TRUE)
# Rearrange the dataset
log_tpm <- log_tpm[, c("Gene.Name", setdiff(names(log_tpm), "Gene.Name"))]
# Removing unmapped genes bnumber
log_tpm <- subset(log_tpm, !is.na(Gene.Name))
#removing duplicate genes (it also has all expression values equal 0 so very bad)
log_tpm <- subset(log_tpm, !(log_tpm$Gene.Name == "insI2"))
#setting rownames and dropping the first 2 columns
rownames(log_tpm) <- log_tpm$Gene.Name
log_tpm <- log_tpm[,3:ncol(log_tpm)]
#transpose log_tpm
log_tpm <- t(log_tpm)
Now let’s analyze the distribution of the expression level to understand which is the best value to chose: mean, median, maximum and minimum of expression. We also want check if the distributions follow a Gaussian with the Shapiro–Wilk test:
\[ W = \dfrac{\big(\sum^n _ {i=n} a_i x_{(i)} \big ) ^2}{\sum^n _ {i=n} (x_i - \bar{x})^2} \]
Where:
\[ H_o: \text{a sample } x_1, \cdots, x_n \text{ is drawn from a normally distributed population.} \\ H_a: \text{a sample } x_1, \cdots, x_n \text{ is not drawn from a normally distributed population.} \]
# Histograms of Summary Statistics
log_tpm_mean <- data.frame(value = apply(log_tpm, 2, mean))
mean_hist <- ggplot(log_tpm_mean, aes(x = value)) +
geom_histogram(binwidth = 0.5, fill = "skyblue",
color = "black", bins = 100) +
labs(x = "Mean log-TPM", y = "Frequency") +
theme_minimal() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
shapiro.test(log_tpm_mean$value) #not gaussian
Shapiro-Wilk normality test
data: log_tpm_mean$value
W = 0.98717, p-value < 2.2e-16
log_tpm_median <- data.frame(value = apply(log_tpm, 2, median))
median_hist <- ggplot(log_tpm_median, aes(x = value)) +
geom_histogram(binwidth = 0.5, fill = "lightgreen",
color = "black", bins = 100) +
labs(x = "Median log-TPM", y = "Frequency") +
theme_minimal() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
shapiro.test(log_tpm_median$value) #not gaussian
Shapiro-Wilk normality test
data: log_tpm_median$value
W = 0.98578, p-value < 2.2e-16
log_tpm_max <- data.frame(value = apply(log_tpm, 2, max))
max_hist <- ggplot(log_tpm_max, aes(x = value)) +
geom_histogram(binwidth = 0.5, fill = "lavender",
color = "black", bins = 100) +
labs(x = "Max log-TPM", y = "Frequency") +
theme_minimal() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
shapiro.test(log_tpm_max$value) #not gaussian
Shapiro-Wilk normality test
data: log_tpm_max$value
W = 0.99611, p-value = 3.889e-09
log_tpm_min <- data.frame(value = apply(log_tpm, 2, min))
min_hist <- ggplot(log_tpm_min, aes(x = value)) +
geom_histogram(binwidth = 0.5, fill = "lightpink",
color = "black", bins = 100) +
labs(x = "Min log-TPM", y = "Frequency") +
theme_minimal() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
shapiro.test(log_tpm_min$value) #not gaussian
Shapiro-Wilk normality test
data: log_tpm_min$value
W = 0.91615, p-value < 2.2e-16
grid.arrange(mean_hist, median_hist, max_hist, min_hist, nrow = 2, ncol = 2,
top = textGrob("Histograms of Summary Statistics",
gp=gpar(fontsize=16)))
Based on the results of the Shapiro-Wilk tests conducted on the four
distributions and their respective histograms, we can conclude that the
distributions do not follow to a Gaussian (normal) distribution.
Consequently, this implies that chosing measures such as the mean,
median, maximum, or minimum may not match the linear assumption that the
data follows a Gaussian (normal) distribution.
Now we plot some histograms to have a better idea on the number of positive and negative target genes regulated by each regulator.
# Histogram of how many genes are regulated by each gene
positive_reg <- regulator[regulator$X6.function == "+",]
negative_reg <- regulator[regulator$X6.function == "-",]
unique_regulators <- unique(regulator$X3.RegulatorGeneName)
pos_counts <- c()
neg_counts <- c()
for(reg in unique_regulators){
pos_counts <- c(pos_counts,
count(positive_reg[positive_reg$X3.RegulatorGeneName == reg,]))
neg_counts <- c(neg_counts,
count(negative_reg[negative_reg$X3.RegulatorGeneName == reg,]))
}
pos_counts <- unlist(pos_counts)
names(pos_counts) <- unique_regulators
neg_counts <- unlist(neg_counts)
names(neg_counts) <- unique_regulators
pos_counts <- data.frame(value = pos_counts)
shapiro.test(pos_counts$value) #not gaussian
neg_counts <- data.frame(value = neg_counts)
shapiro.test(neg_counts$value) #not gaussian
total_counts <- data.frame(value = pos_counts$value + neg_counts$value)
shapiro.test(total_counts$value) #not gaussian
pos_counts_hist <- ggplot(pos_counts, aes(x = value)) +
geom_histogram(binwidth = 10, fill = "skyblue",
color = "black", bins = 20) +
labs(x = "Positive Regulations Count", y = "Frequency") +
theme_minimal() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
neg_counts_hist <- ggplot(neg_counts, aes(x = value)) +
geom_histogram(binwidth = 10, fill = "lightgreen",
color = "black", bins = 20) +
labs(x = "Negative Regulations Count", y = "Frequency") +
theme_minimal() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
total_counts_hist <- ggplot(total_counts, aes(x = value)) +
geom_histogram(binwidth = 10, fill = "lightpink",
color = "black", bins = 20) +
labs(x = "Total Regulations Count", y = "Frequency") +
theme_minimal() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
grid.arrange(pos_counts_hist, neg_counts_hist, total_counts_hist, nrow = 1)
There are slightly more negative regulator then positive. In addition we can also notice that the majority of regulators have less than 100 target. There is one exception: crp has 300 target.
# Creating adj matrix to plot the network
edge_list <- cbind(RagulatorName = regulator$X3.RegulatorGeneName,
Target = regulator$X5.regulatedName)
graph <- graph_from_edgelist(edge_list)
layout <- layout_with_fr(graph, niter=100)
# Visualizzazione 3D della network
plot.igraph(graph, layout=layout, vertex.label=NA, vertex.size=3, edge.arrow.size=0.2, edge.curved=TRUE, main="E.coli Network", xlim=c(-0.5, 0.5), ylim=c(-1, 1))